#importing modules
import sys
import os
import csv
import re
import rmv_similar
import subprocess
from copy import *
from scipy import *
from subprocess import*
from Bio import SeqIO


# Directory layout. Everything defaults to the current working directory.
# PATH_IN and PATH_OUT must end with a path separator because file names are
# appended to them by plain string concatenation.
WRKNG_DIR =os.getcwd()+"/"
PATH_IN =os.getcwd()+"/" #You can specify a different input directory

PATH_OUT=os.getcwd()+"/" #You can specify a different output directory

# Alignment to analyse (FASTA). Uncomment exactly one of the alternatives.
name_of_file="DG_CVE.fas"
#name_of_file="Anello_bE.fas"
#name_of_file="Anello_cE.fas"
#name_of_file="Anello_eE.fas"
#name_of_file="Parvo_aE.fas"
#name_of_file="Parvo_bE.fas"
#name_of_file="Parvo_cE.fas"
#name_of_file="Parvo_eE.fas"


window_size=100 #Specify the window size
ori_seqs_num=49 #Specify the number of sequences within the original alignment before performing recombination test
LenSeq=2046 #Specify the sequence length

# Load every record of the input alignment. The `with` block guarantees the
# file is closed once parsing has finished, even if parsing fails.
with open(PATH_IN+name_of_file,'r') as alignment_handler:
    alignment_list=list(SeqIO.parse(alignment_handler,'fasta'))

# The input file must hold at least ori_seqs_num records; otherwise abort.
if len(alignment_list)<ori_seqs_num:
    print("Error: the number of sequences from the original alignment should be less than the number of sequences within the correspoding distributed alignment")
    sys.exit(1)
# Length of the first record (assumes all records share it -- TODO confirm).
seq_full_len =len(alignment_list[0].seq)

def SplitWindows(seq_list,st_index,en_index,max_seqs=None):
    """Cut one column window out of an alignment and keep the least gappy rows.

    Every record of *seq_list* is deep-copied (the input is never modified),
    its ``seq`` is sliced to ``[st_index:en_index]`` and the number of ``'-'``
    characters in the slice is counted.  The *max_seqs* records with the
    fewest gaps are kept; ties are resolved in favour of the record that
    comes first in *seq_list*.

    Parameters:
        seq_list:  sequence of records exposing a sliceable ``seq`` attribute
                   with a ``count`` method.
        st_index:  first column of the window (inclusive).
        en_index:  last column of the window (exclusive).
        max_seqs:  how many records to keep; defaults to the module-level
                   ``ori_seqs_num``.

    Returns:
        List of the kept (copied, sliced) records in their original order.
    """
    if max_seqs is None:
        max_seqs = ori_seqs_num

    windowed = deepcopy(seq_list)
    for record in windowed:
        record.seq = record.seq[st_index:en_index]

    gap_counts = [record.seq.count('-') for record in windowed]

    # sorted() is stable, so rows with equal gap counts stay in input order.
    # That matches the tie-break of the structured-array sort used previously
    # (remaining fields, i.e. the row index, decide ties).
    by_gaps = sorted(range(len(windowed)), key=lambda idx: gap_counts[idx])

    # Restore alignment order among the selected rows.
    kept = sorted(by_gaps[:max_seqs])
    return [windowed[idx] for idx in kept]
    
def CompareSeqIds(DataSetOne,DataSetTwo):
    """Tell whether two record collections list the same ids in the same order.

    Parameters:
        DataSetOne, DataSetTwo: iterables of records exposing an ``id``
            attribute.

    Returns:
        1 when both collections have the same length and identical ids at
        every position, otherwise 0.  Collections of different length are
        reported as different instead of raising IndexError or silently
        ignoring the surplus records.
    """
    ids_one = [record.id for record in DataSetOne]
    ids_two = [record.id for record in DataSetTwo]
    return 1 if ids_one == ids_two else 0

# Names of every FASTA file written by SaveSeqFiles (accumulates across calls).
saved_filenames=[]

def _save_window_block(original_alignment, fst_pos, increament):
    """Write one region of the alignment to PATH_OUT and record its file name.

    The region covers columns fst_pos .. fst_pos+increament+window_size
    (end exclusive); the file is named spltd_<fst_pos>_<fst_pos+increament+1>.fasta.
    """
    dataset_tosave=SplitWindows(original_alignment,fst_pos,fst_pos+increament+window_size)
    name=PATH_OUT+"spltd_%d_%d.fasta"%(fst_pos,fst_pos+increament+1)
    saved_filenames.append(name)
    # `with` closes the file even when SeqIO.write raises.
    with open(name,'w') as output:
        SeqIO.write(dataset_tosave,output,'fasta')

def SaveSeqFiles(original_alignment, fst_pos, lst_pos):
    """Split an alignment into regions over which the selected sequences are stable.

    A window of ``window_size`` columns is moved one column at a time from
    *fst_pos* towards *lst_pos*.  While the window at the current offset and
    the window one column further select the same sequence ids (SplitWindows
    + CompareSeqIds), the current region keeps growing.  When the selection
    changes, or the window start reaches ``lst_pos-window_size``, the region
    is written to a FASTA file in PATH_OUT and, in the former case, a new
    region starts at the next column.

    Parameters:
        original_alignment: list of sequence records, passed to SplitWindows.
        fst_pos: first column to analyse.
        lst_pos: end column of the analysis.

    Returns:
        The module-level ``saved_filenames`` list with the names of all
        files written so far.

    NOTE(review): if a selection change is detected such that the next
    region would start exactly at ``lst_pos-window_size``, the loop ends
    without writing that last window -- confirm whether this is intended.
    """
    increament=0

    while fst_pos < (lst_pos-window_size):
        window_start=fst_pos+increament
        current_window=SplitWindows(original_alignment,window_start,window_start+window_size)
        next_window=SplitWindows(original_alignment,window_start+1,window_start+window_size+1)

        if CompareSeqIds(current_window,next_window)==1:
            increament+=1
            # Reached the last possible window start: flush and stop.
            if fst_pos+increament==lst_pos-window_size:
                _save_window_block(original_alignment,fst_pos,increament)
                break
        else:
            # Selection changed: close the current region, start a new one.
            _save_window_block(original_alignment,fst_pos,increament)
            fst_pos=fst_pos+increament+1
            increament=0
            print('break pos %s' % fst_pos)
    print("saved files %s" % len(saved_filenames))
    return saved_filenames
                
def DrawMLTreeParallel(phylip_files, phyml_exe='C:/My Programs/PhyML_3.0/PhyML_3.0_win32.exe'):
    """Run PhyML on every input file, starting all processes before waiting.

    One ``<phyml_exe> -i <file>`` process is started per entry of
    *phylip_files*; afterwards each process is waited for in start order.
    A non-zero exit status is reported on stderr together with the captured
    stderr output; successful runs are counted and announced on stdout.

    Parameters:
        phylip_files: list of alignment file names handed to PhyML via ``-i``.
        phyml_exe:    path of the PhyML executable (defaults to the original
                      hard-coded Windows location).

    NOTE(review): the number of simultaneous processes is not limited.
    """
    # Start everything first so the runs overlap.
    process_list=[Popen([phyml_exe,'-i',in_file_name],stdout=PIPE,stderr=PIPE)
                  for in_file_name in phylip_files]
    print(len(phylip_files))

    k=1
    for proc in process_list:
        # communicate() drains both pipes and waits for the process to exit,
        # so returncode is set afterwards.
        (output, error)=proc.communicate()
        if proc.returncode != 0:
            sys.stderr.write('Error occured running PhyML')
            sys.stderr.write(error)
        else:
            print('tree no.%d' %k)
            k+=1
    print('All ML trees saved')
  
def DrawMLTree(phylip_files, phyml_exe='C:/My Programs/PhyML_3.0/PhyML_3.0_win32.exe'):
    """Run PhyML on every input file, one process at a time.

    For each entry of *phylip_files* the command ``<phyml_exe> -i <file>`` is
    executed and waited for.  A non-zero exit status is reported on stderr
    together with the captured stderr output.  A progress line is printed
    for every file, whether or not the run succeeded.

    Parameters:
        phylip_files: list of alignment file names handed to PhyML via ``-i``.
        phyml_exe:    path of the PhyML executable (defaults to the original
                      hard-coded Windows location).
    """
    for j, in_file_name in enumerate(phylip_files, 1):
        proc=Popen([phyml_exe,'-i',in_file_name],stdout=PIPE,stderr=PIPE)
        # communicate() waits for the process to exit and sets returncode.
        (output, error)=proc.communicate()
        if proc.returncode != 0:
            sys.stderr.write('Error occured running PhyML')
            sys.stderr.write(error)
        print('tree no.%d' %j)

# Names of every PHYLIP file produced by ConvFastaToPhy (accumulates across calls).
phylip_files_saved=[]

def ConvFastaToPhy(file_names_list):#input a list of file names
    """Convert FASTA files to PHYLIP format next to the originals.

    Each input name has its extension replaced by ``.phy``.  Only the final
    extension is replaced, so dots in directory names do not truncate the
    path.

    Parameters:
        file_names_list: list of FASTA file names.

    Returns:
        The module-level ``phylip_files_saved`` list containing the names of
        all PHYLIP files produced so far.
    """
    for fasta_name in file_names_list:
        phylip_name=os.path.splitext(fasta_name)[0]+'.phy'
        phylip_files_saved.append(phylip_name)
        SeqIO.convert(fasta_name,'fasta',phylip_name,"phylip")
    return phylip_files_saved
 
 
def RMVTempFile():
    """Delete PhyML by-products from PATH_OUT and the saved window FASTA files.

    Removes every entry of PATH_OUT whose name ends in ``phyml_stats.txt``,
    ``.phy`` or ``phyml_tree.txt``, then every file listed in the
    module-level ``saved_filenames``.
    """
    temp_suffixes=('phyml_stats.txt','.phy','phyml_tree.txt')
    for temp_file in os.listdir(PATH_OUT):
        if temp_file.endswith(temp_suffixes):
            # listdir() yields bare names; resolve them against PATH_OUT so
            # this also works when PATH_OUT is not the working directory.
            os.remove(os.path.join(PATH_OUT,temp_file))
    for fil in saved_filenames:
        os.remove(fil)
            
def RMVSimilarSequences(seq_files):
    """Write a copy of each FASTA file with duplicate sequences removed.

    For every file the records are read, only the first record of each
    distinct sequence is kept, and the result is written to PATH_OUT as
    ``<input base name>E.fasta``.

    Parameters:
        seq_files: list of FASTA file names.

    NOTE(review): the output location (PATH_OUT plus the input's base name)
    is inferred from the previous, non-working implementation -- confirm.
    """
    for file_name in seq_files:
        with open(file_name,'r') as file_handler:
            curr_records=list(SeqIO.parse(file_handler,'fasta'))

        seen_seqs=set()
        unique_records=[]
        for record in curr_records:
            seq_text=str(record.seq)
            if seq_text not in seen_seqs:
                seen_seqs.add(seq_text)
                unique_records.append(record)

        base_name=os.path.splitext(os.path.basename(file_name))[0]
        with open(os.path.join(PATH_OUT,base_name+'E.fasta'),'w') as output:
            SeqIO.write(unique_records,output,'fasta')
        
               
# Names of every de-duplicated FASTA file written by RMVSimSeqs (accumulates across calls).
files_to_analyse=[]
def RMVSimSeqs(seq_files):
    """Write a de-duplicated, gap-filtered copy of each FASTA file.

    For every input file only the first record of each distinct sequence is
    kept, records consisting solely of ``'-'`` are dropped, and the result is
    written next to the input with ``s.fasta`` replacing the extension.

    Parameters:
        seq_files: list of FASTA file names.

    Returns:
        The module-level ``files_to_analyse`` list containing the names of
        all files written so far.
    """
    for file_name in seq_files:
        with open(file_name,'r') as file_handler:
            curr_records=list(SeqIO.parse(file_handler,'fasta'))

        seen_seqs=set()
        new_records=[]
        for record in curr_records:
            seq_text=str(record.seq)
            # Later copies of an already seen sequence are discarded.
            if seq_text in seen_seqs:
                continue
            seen_seqs.add(seq_text)
            #Removing gap seqs
            if seq_text.count('-')==len(seq_text):
                continue
            new_records.append(record)

        # Replace only the final extension so dots elsewhere in the path
        # do not truncate it.
        name=os.path.splitext(file_name)[0]+'s.fasta'
        files_to_analyse.append(name)
        with open(name,'w') as output:
            SeqIO.write(new_records,output,'fasta')
    return files_to_analyse

#Removing the additional annotations that PHYML adds
def resaveTree(inputfile,outputfile):
    """Copy a tree file, stripping ``<digit>.<digits>`` values that precede a colon.

    Every occurrence of a single digit, a dot and one or more digits that is
    immediately followed by ``:`` is replaced by ``:`` alone (presumably the
    support values PhyML attaches to internal nodes -- verify against the
    PhyML output format).  Each output line ends with exactly one newline.

    Parameters:
        inputfile:  path of the tree file to read.
        outputfile: path of the cleaned file to write.

    NOTE(review): the pattern consumes only one digit before the decimal
    point, so a value such as ``10.5:`` would leave a stray ``1`` behind.
    """
    with open(inputfile, "r") as f:
        lines = f.readlines()
    with open(outputfile,"w") as outfile:
        for line in lines:
            cleaned = re.sub(r"[0-9]\.[0-9]+\:", ":", line)
            # readlines() keeps the line terminator; strip it before adding
            # one back so the output is not double-spaced.
            outfile.write(cleaned.rstrip("\n")+"\n")
   
def PhyMLAnalysis( alignment_data,st_pstn,en_pstn):
    """Run the whole pipeline: split, de-duplicate, convert, build trees, clean trees.

    Steps:
      1. SaveSeqFiles splits the alignment between *st_pstn* and *en_pstn*
         into FASTA files.
      2. RMVSimSeqs writes de-duplicated copies of those files.
      3. ConvFastaToPhy converts the copies to PHYLIP format.
      4. DrawMLTree runs PhyML on every PHYLIP file.
      5. Every ``*phyml_tree.txt`` file found in PATH_OUT is re-saved as
         ``.nwk`` through resaveTree.

    Parameters:
        alignment_data: list of sequence records.
        st_pstn:        first column to analyse.
        en_pstn:        end column of the analysis.
    """
    window_files=SaveSeqFiles(alignment_data,st_pstn,en_pstn)
    # De-duplication is done once; repeating it after PhyML has run would
    # only rewrite the same files.
    deduplicated_files=RMVSimSeqs(window_files)
    DrawMLTree(ConvFastaToPhy(deduplicated_files))

    #Resaving phyml trees
    for fil in os.listdir(PATH_OUT):
        if fil.endswith('phyml_tree.txt'):
            # fil is a bare file name; everything from its first dot on is
            # dropped to form the .nwk name.
            name_out=fil.split('.')[0]+".nwk"
            resaveTree(os.path.join(PATH_OUT,fil),os.path.join(PATH_OUT,name_out))
    #RMVTempFile()
    print('----------------------------\nPhyML Trees Resaved')
    print('---------------------------\nPhyML Analysis Completed')
    
def removeTMP(path):
    """Delete every regular file in *path* whose name ends in ``fasta`` or ``txt``.

    Parameters:
        path: directory to clean; a trailing path separator is optional.

    Directories that happen to match the suffixes are left alone.
    """
    for entry in os.listdir(path):
        if entry.endswith(("fasta","txt")):
            full_path=os.path.join(path,entry)
            if os.path.isfile(full_path):
                os.remove(full_path)





#================================================ Main program ============================================

# Analyse columns 0..LenSeq of the loaded alignment, then delete every file
# ending in "fasta" or "txt" from the output directory.
PhyMLAnalysis(alignment_data=alignment_list, st_pstn=0, en_pstn=LenSeq)
removeTMP(path=PATH_OUT)


         
         
         
         
         
         
         
         
         
         
         
         
         
         
                
        
        
        
    
    
    
